Use this page to describe the results in your own words. Some things to think about:
---
title: "DIYtranscriptomics"
output:
flexdashboard::flex_dashboard:
orientation: rows
source_code: embed
---
```{r setup, include=FALSE}
library(tidyverse)
library(tximport)
library(biomaRt)
library(RColorBrewer)
library(reshape2)
library(genefilter)
library(edgeR)
library(matrixStats)
library(hrbrthemes)
library(gplots)
library(limma)
library(heatmaply)
library(d3heatmap)
library(DT)
library(gt)
library(plotly)
knitr::opts_chunk$set(message = FALSE)
```
Overview {data-icon="fa-signal"}
=====================================
Row {data-height=400}
------------------------------------------------------------------------------
### Signal distribution: raw data
```{r raw data}
# importing data into R using Tximport package
targets <- read_tsv("../../studyDesign.txt")# read in your study design
path <- file.path("../readMapping", targets$sample, "abundance.h5") # set file paths to your mapped data
targets <- mutate(targets, path) # add paths to your study design (only necessary for Sleuth)
Hs.anno <- useMart(biomart="ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl") # select 'mart' from biomaRt for annotations
Tx <- getBM(attributes=c('ensembl_transcript_id_version', # get gene symbols for each transcript ID
'external_gene_name'),
mart = Hs.anno)
Tx <- as_tibble(Tx) # convert this annotation mapping file to a tibble (the tidyverse version of a dataframe)
Txi_gene <- tximport(path, #reading kallisto data into R
type = "kallisto",
tx2gene = Tx,
txOut = FALSE,
countsFromAbundance = "lengthScaledTPM")
myCPM <- as_tibble(Txi_gene$abundance, rownames = "geneSymbol") # these are you counts after adjusting for transcript length
myCounts <- as_tibble(Txi_gene$counts, rownames = "geneSymbol") # these are your transcript per million (TPM) values, or counts per million (CPM) if you collapsed data to gene level
groups1 <- targets$treatment
groups1 <- factor(groups1)
sampleLabels <- targets$sample
myDGEList <- DGEList(Txi_gene$counts)
log2.cpm <- cpm(myDGEList, log=TRUE)
nsamples <- ncol(log2.cpm)
myColors <- brewer.pal(nsamples, "Paired")
log2.cpm.df <- as_tibble(log2.cpm)
colnames(log2.cpm.df) <- sampleLabels
log2.cpm.df.melt <- melt(log2.cpm.df)
ggplot(log2.cpm.df.melt, aes(x=variable, y=value, fill=variable)) +
geom_violin(trim = FALSE, show.legend = FALSE) +
stat_summary(fun.y = "median", geom = "point", shape = 124, size = 6, color = "black", show.legend = FALSE) +
labs(y="log2 expression", x = "sample") +
#title="Log2 Counts per Million (CPM)",
#subtitle="unfiltered, non-normalized",
#caption=paste0("produced on ", Sys.time())) +
coord_flip() +
theme_ipsum_rc()
```
### Signal distribution: filtered and normalized data
```{r processed data}
cpm <- cpm(myDGEList)
keepers <- rowSums(cpm>1)>=3 #user defined
myDGEList.filtered <- myDGEList[keepers,]
myDGEList.filtered.norm <- calcNormFactors(myDGEList.filtered, method = "TMM")
log2.cpm.filtered.norm <- cpm(myDGEList.filtered.norm, log=TRUE)
log2.cpm.filtered.norm.df <- as_tibble(log2.cpm.filtered.norm)
colnames(log2.cpm.filtered.norm.df) <- sampleLabels
log2.cpm.filtered.norm.df.melt <- melt(log2.cpm.filtered.norm.df)
ggplot(log2.cpm.filtered.norm.df.melt, aes(x=variable, y=value, fill=variable)) +
geom_violin(trim = FALSE, show.legend = FALSE) +
stat_summary(fun.y = "median", geom = "point", shape = 124, size = 6, color = "black", show.legend = FALSE) +
labs(y="log2 expression", x = "sample") +
#title="Log2 Counts per Million (CPM)",
#subtitle="filtered, TMM normalized",
#caption=paste0("produced on ", Sys.time())) +
coord_flip() +
theme_ipsum_rc()
```
### PCA plot
```{r PCA}
pca.res <- prcomp(t(log2.cpm.filtered.norm), scale.=F, retx=T)
pc.var<-pca.res$sdev^2
pc.per<-round(pc.var/sum(pc.var)*100, 1)
pca.res.df <- as_tibble(pca.res$x)
ggplot(pca.res.df, aes(x=PC1, y=PC3, color=groups1)) +
geom_point(size=5) +
xlab(paste0("PC1 (",pc.per[1],"%",")")) +
ylab(paste0("PC3 (",pc.per[3],"%",")")) +
theme_ipsum_rc() +
theme(legend.title=element_blank())
```
Row {data-height=400}
------------------------------------------------------------------------------
### Filtered and normalized counts per million (CPM)
```{r filtered and normalized data}
mydata.df <- as_tibble(log2.cpm.filtered.norm, rownames = "geneSymbol")
colnames(mydata.df) <- c("geneSymbol", sampleLabels)
#user defined
mydata.df <- mutate(mydata.df,
uninfected.AVG = (uninf_rep1 + uninf_rep2 + uninf_rep1)/3,
crypto.wt.AVG = (crypto.wt_rep1 + crypto.wt_rep2 + crypto.wt_rep3)/3,
crypto.mut.AVG = (crypto.mut_rep1 + crypto.mut_rep2 + crypto.mut_rep3)/3,
#now make columns comparing each of the averages above that you're interested in
LogFC.crypto.wt_vs_uninfected = (crypto.wt.AVG - uninfected.AVG),
LogFC.crypto.mut_vs_uninfected = (crypto.mut.AVG - uninfected.AVG)) %>%
mutate_if(is.numeric, round, 2)
mydata.sort <- mydata.df %>%
dplyr::arrange(desc(LogFC.crypto.wt_vs_uninfected)) %>%
dplyr::select(geneSymbol, uninfected.AVG, crypto.wt.AVG, LogFC.crypto.wt_vs_uninfected)
datatable(mydata.df,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 5, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(1:6), digits=2)
```
DEGs {data-orientation=columns data-icon="fa-random"}
=====================================
Column {data-width=500}
------------------------------------------------------------------------------
### Volcano plot
```{r volcano plot}
design <- model.matrix(~0 + groups1)
colnames(design) <- levels(groups1)
v.DEGList.filtered.norm <- voom(myDGEList.filtered.norm, design, plot = FALSE)
contrast.matrix <- makeContrasts(infection_with_WT = crypto.wt - uninfected,
infection_with_Mut = crypto.mut - uninfected,
levels=design)
fit <- lmFit(v.DEGList.filtered.norm, design)
fits <- contrasts.fit(fit, contrast.matrix)
ebFit <- eBayes(fits)
myTopHits <- topTable(ebFit, adjust ="BH", coef=1, number=40000, sort.by="logFC")
myTopHits <- as_tibble(myTopHits, rownames = "geneSymbol")
ggplotly(ggplot(myTopHits, aes(y=-log10(adj.P.Val), x=logFC, text = paste("Symbol:", geneSymbol))) +
geom_point(size=2) +
ylim(-0.5,12) +
geom_hline(yintercept = -log10(0.01), linetype="longdash", colour="grey", size=1) +
geom_vline(xintercept = 1, linetype="longdash", colour="#BE684D", size=1) +
geom_vline(xintercept = -1, linetype="longdash", colour="#2C467A", size=1) +
theme_ipsum_rc())
```
Column {.tabset data-width=500}
------------------------------------------------------------------------------
### heatmap
```{r heatmap}
#heatmap of all DEGs
#create additional heatmaps for co-expressed modules or a priori list of genes
design <- model.matrix(~0 + groups1)
colnames(design) <- levels(groups1)
v.DEGList.filtered.norm <- voom(myDGEList.filtered.norm, design, plot = FALSE)
fit <- lmFit(v.DEGList.filtered.norm, design)
contrast.matrix <- makeContrasts(infection_with_WT = crypto.wt - uninfected,
infection_with_Mut = crypto.mut - uninfected,
levels=design)
fits <- contrasts.fit(fit, contrast.matrix)
ebFit <- eBayes(fits)
results <- decideTests(ebFit, method="global", adjust.method="BH", p.value=0.01, lfc=2)
colnames(v.DEGList.filtered.norm$E) <- sampleLabels
diffGenes <- v.DEGList.filtered.norm$E[results[,1] !=0 | results[,2] !=0,]
myheatcolors2 <- colorRampPalette(colors=c("yellow","white","blue"))(100)
clustRows <- hclust(as.dist(1-cor(t(diffGenes), method="pearson")), method="complete") #cluster rows by pearson correlation
clustColumns <- hclust(as.dist(1-cor(diffGenes, method="spearman")), method="complete")
module.assign <- cutree(clustRows, k=2)
module.color <- rainbow(length(unique(module.assign)), start=0.1, end=0.9)
module.color <- module.color[as.vector(module.assign)]
d3heatmap(diffGenes,
colors = myheatcolors2,
Rowv=as.dendrogram(clustRows),
row_side_colors = module.color,
dendrogram = "none",
scale='row'
)
```
### DEG table
```{r DEG table}
diffGenes.df <- as_tibble(diffGenes, rownames = "geneSymbol")
diffGenes.df <- mutate(diffGenes.df,
uninfected.AVG = (uninf_rep1 + uninf_rep2 + uninf_rep1)/3,
crypto.wt.AVG = (crypto.wt_rep1 + crypto.wt_rep2 + crypto.wt_rep3)/3,
crypto.mut.AVG = (crypto.mut_rep1 + crypto.mut_rep2 + crypto.mut_rep3)/3,
#now make columns comparing each of the averages above that you're interested in
LogFC = (crypto.wt.AVG - uninfected.AVG),
LogFC.crypto.mut_vs_uninfected = (crypto.mut.AVG - uninfected.AVG)) %>%
mutate_if(is.numeric, round, 2)
diffGenes.sort <- diffGenes.df %>%
dplyr::arrange(desc(LogFC)) %>%
dplyr::select(geneSymbol, uninfected.AVG, crypto.wt.AVG, LogFC)
datatable(diffGenes.sort,
extensions = c('KeyTable', "FixedHeader"),
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 25, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(1:6), digits=2)
```
About {data-orientation=rows data-icon="fa-comment-alt"}
=====================================
Row {data-height=1000}
------------------------------------------------------------------------------
Use this page to describe the results in your own words. Some things to think about:
* What are the key takeaways from the analysis?
* What types of analyses would you want to do next?
* Based on your analysis, what wet-lab experiments would you pursue?
* How could you expand on or otherwise enhance this data dashboard?